{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9d6c9c2-9487-4a23-94c4-07292ed0f759",
   "metadata": {},
   "outputs": [],
   "source": [
    "from lxml import etree\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml import html\n",
    "import string\n",
    "import time\n",
    "import os\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.keys import Keys\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "from selenium.webdriver.common.action_chains import ActionChains\n",
    "import time\n",
    "#from joblib import Parallel, delayed\n",
    "import re\n",
    "import undetected_chromedriver as uc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e598e874-b6ee-4ae9-8bc2-52280a62cca3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def update_file_list(all_case_num, file_path):\n",
    "    \"\"\"Return the case numbers that have no saved page in ``file_path`` yet.\n",
    "\n",
    "    A case counts as collected when ``file_path`` contains a file named\n",
    "    ``<case number>.html``. The fraction already collected is printed.\n",
    "\n",
    "    Args:\n",
    "        all_case_num: list of every case number to collect; entries are\n",
    "            compared by their ``str()`` form.\n",
    "        file_path: directory that holds the saved ``.html`` pages.\n",
    "\n",
    "    Returns:\n",
    "        list: entries of ``all_case_num`` still to download, in their original\n",
    "        order and without duplicates.\n",
    "    \"\"\"\n",
    "    collected = set()\n",
    "    for name in os.listdir(file_path):\n",
    "        # Remove only a real '.html' extension. str.strip('.html') strips any of\n",
    "        # the characters '.', 'h', 't', 'm', 'l' from BOTH ends, so it also eats\n",
    "        # case-number characters such as a trailing 'lt' or a leading 'h'.\n",
    "        stem, ext = os.path.splitext(name)\n",
    "        collected.add(stem if ext == '.html' else name)\n",
    "\n",
    "    file_list = []\n",
    "    queued = set()\n",
    "    for case in all_case_num:\n",
    "        # Compare by string form: saved file names are always strings.\n",
    "        key = str(case)\n",
    "        if key not in collected and key not in queued:\n",
    "            queued.add(key)\n",
    "            file_list.append(case)\n",
    "\n",
    "    # Nothing requested means nothing is missing; also avoids dividing by zero.\n",
    "    share = 1 - len(file_list)/len(all_case_num) if len(all_case_num) else 1.0\n",
    "    print('Collected', share)\n",
    "    return file_list\n",
    "\n",
    "def get_hcaptcha_access(driver, link):\n",
    "    \"\"\"Click the hCaptcha accessibility-cookie button in a second tab.\n",
    "\n",
    "    Loads Google, Ctrl-clicks its ``About`` link to open a second tab, loads\n",
    "    ``link`` in that tab, clicks the ``setAccessibilityCookie`` button and\n",
    "    finally switches the driver back to the first tab.\n",
    "\n",
    "    Args:\n",
    "        driver: Selenium WebDriver to operate on (assumed to have a single\n",
    "            open window when called).\n",
    "        link: URL loaded in the second tab; it must show the\n",
    "            ``setAccessibilityCookie`` button.\n",
    "\n",
    "    Raises:\n",
    "        Whatever Selenium raises when the ``About`` link is missing, or when\n",
    "        the new tab or the button is not available within 10 seconds.\n",
    "    \"\"\"\n",
    "    wait = WebDriverWait(driver, 10)\n",
    "    driver.get('http://google.com')\n",
    "    handles_before = driver.window_handles\n",
    "    about_link = driver.find_element(By.LINK_TEXT, \"About\")\n",
    "    ActionChains(driver).key_down(Keys.CONTROL).click(about_link).key_up(Keys.CONTROL).perform()\n",
    "    # Wait for the tab to exist before indexing window_handles; right after the\n",
    "    # click the handle list may still contain only the first window.\n",
    "    wait.until(EC.new_window_is_opened(handles_before))\n",
    "    driver.switch_to.window(driver.window_handles[1])\n",
    "    try:\n",
    "        time.sleep(1)\n",
    "        driver.get(link)\n",
    "        time.sleep(2)\n",
    "        wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@data-cy=\"setAccessibilityCookie\"]'))).click()\n",
    "        # Fixed pause after the click (presumably lets the cookie be stored).\n",
    "        time.sleep(5)\n",
    "    finally:\n",
    "        # Return to the first tab even when the second tab failed, so later\n",
    "        # calls do not run against the wrong window.\n",
    "        driver.switch_to.window(driver.window_handles[0])\n",
    "    \n",
    "def _save_page(target, page):\n",
    "    \"\"\"Write ``page`` to ``target`` as UTF-8 via a temporary ``.part`` file; return success.\"\"\"\n",
    "    partial = target + '.part'\n",
    "    try:\n",
    "        with open(partial, \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(page)\n",
    "        # Rename only after a complete write, so ``target`` is never truncated.\n",
    "        os.replace(partial, target)\n",
    "    except (OSError, UnicodeError) as err:\n",
    "        print('Could not save', target, repr(err))\n",
    "        if os.path.exists(partial):\n",
    "            try:\n",
    "                os.remove(partial)\n",
    "            except OSError:\n",
    "                pass\n",
    "        return False\n",
    "    return True\n",
    "\n",
    "\n",
    "def download_ROA(driver, file_list, file_path):\n",
    "    \"\"\"Search every case number on the loaded search page and save the result HTML.\n",
    "\n",
    "    Only entries whose string form contains exactly two ``-`` are processed.\n",
    "    The year field receives the last two characters of the first segment and\n",
    "    the number field the last six characters of the second segment. After a\n",
    "    fixed 2 second pause the page source is saved as\n",
    "    ``<file_path>/<case_number>.html`` (UTF-8).\n",
    "\n",
    "    Args:\n",
    "        driver: Selenium WebDriver already showing the search form.\n",
    "        file_list: iterable of case numbers to download.\n",
    "        file_path: directory that receives the ``.html`` files.\n",
    "\n",
    "    Raises:\n",
    "        Whatever Selenium raises when the search inputs or the submit button\n",
    "        cannot be found; this is not caught, so the loop stops there.\n",
    "    \"\"\"\n",
    "    year_xpath = '//input[@name=\"txtCaseYR\"]'\n",
    "    number_xpath = '//input[@name=\"txtCaseNbr\"]'\n",
    "    for case_number in file_list:\n",
    "        case_number = str(case_number)\n",
    "        if case_number.count('-') != 2:\n",
    "            continue\n",
    "\n",
    "        try:\n",
    "            driver.find_element(By.XPATH, '//a[@id=\"btnNewSearch\"]').click()\n",
    "        except Exception:\n",
    "            # Best effort: the link may be absent (presumably the search form\n",
    "            # is already showing). KeyboardInterrupt is no longer swallowed.\n",
    "            pass\n",
    "\n",
    "        segments = case_number.split('-')\n",
    "        driver.find_element(By.XPATH, year_xpath).clear()\n",
    "        driver.find_element(By.XPATH, number_xpath).clear()\n",
    "        driver.find_element(By.XPATH, year_xpath).send_keys(segments[0][-2:])\n",
    "        driver.find_element(By.XPATH, number_xpath).send_keys(segments[1][-6:])\n",
    "        driver.find_element(By.XPATH, '//input[@type=\"submit\"]').click()\n",
    "\n",
    "        time.sleep(2)\n",
    "        try:\n",
    "            page = driver.page_source\n",
    "        except Exception as err:\n",
    "            print('Could not read page for', case_number, repr(err))\n",
    "        else:\n",
    "            # A failed write must not leave an empty or partial .html behind:\n",
    "            # such a file would later be counted as an already collected case.\n",
    "            _save_page(os.path.join(file_path, case_number + \".html\"), page)\n",
    "\n",
    "        try:\n",
    "            wait = WebDriverWait(driver, 10)\n",
    "            back = wait.until(EC.element_to_be_clickable((By.XPATH, './/a[@id=\"btnNewSearch\"]')))\n",
    "            back.click()\n",
    "        except Exception:\n",
    "            # Best effort: the next iteration tries the same link again.\n",
    "            pass\n",
    "\n",
    "def bypass_captcha(driver):\n",
    "    \"\"\"Click the hCaptcha widgets until the case search form is present.\n",
    "\n",
    "    Makes at most 8 attempts. Each attempt clicks the ``checkbox`` span, waits\n",
    "    3 seconds and clicks the second ``hcaptcha-box`` div; if any of that fails\n",
    "    it retries only the ``hcaptcha-box`` click. An attempt succeeded when an\n",
    "    input named ``txtCaseYR`` is found on the page.\n",
    "\n",
    "    Args:\n",
    "        driver: Selenium WebDriver showing the captcha page.\n",
    "\n",
    "    Returns:\n",
    "        bool: True once the ``txtCaseYR`` input is found, False after 8\n",
    "        unsuccessful attempts.\n",
    "    \"\"\"\n",
    "    box_xpath = '//div[@class=\"hcaptcha-box\"]'\n",
    "    for _ in range(8):\n",
    "        try:\n",
    "            driver.find_element(By.XPATH, '//span[@class=\"checkbox\"]').click()\n",
    "            time.sleep(3)\n",
    "            driver.find_elements(By.XPATH, box_xpath)[1].click()\n",
    "        except Exception:\n",
    "            # 'except Exception' instead of a bare except, so interrupting the\n",
    "            # kernel during the pause stops the loop instead of being swallowed.\n",
    "            try:\n",
    "                driver.find_elements(By.XPATH, box_xpath)[1].click()\n",
    "            except Exception:\n",
    "                pass\n",
    "        if driver.find_elements(By.XPATH, '//input[@name=\"txtCaseYR\"]'):\n",
    "            return True\n",
    "    return False\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "565b1a39-0d8f-450f-87bc-b149829069de",
   "metadata": {},
   "outputs": [],
   "source": [
    "#if __name__ == \"__main__\":\n",
    "#    case_number_file = 'case_number_MB.xlsx'\n",
    "#    all_case_num = pd.read_excel('C:/Users/'+os.getlogin()+'/Dropbox/C07/2case_number/'+case_number_file)['case_number'].to_list()\n",
    "#    ROA_path = 'C:/Users/'+os.getlogin()+'/Dropbox/C07/3ROAs/'\n",
    "#    ROA_list = update_file_list(all_case_num, ROA_path)\n",
    "#    driver = uc.Chrome()\n",
    "#    cookies_link = your_hcaptcha_link\n",
    "#    get_hcaptcha_access(driver, cookies_link)\n",
    "#    driver.get('https://www.co.genesee.mi.us/roaccsinq/default.aspx')\n",
    "#    try:\n",
    "#        download_ROA(driver, ROA_list, ROA_path)\n",
    "#    except:\n",
    "#        bypass_captcha(driver)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf88d4aa-624a-4df8-a911-887d2eeb3f0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Workbook holding the full list of case numbers to collect.\n",
    "case_number_file = 'case_number_MB.xlsx'\n",
    "all_case_num = pd.read_excel(\n",
    "    f'C:/Users/{os.getlogin()}/Dropbox/C07/2case_number/{case_number_file}'\n",
    ")['case_number'].to_list()\n",
    "\n",
    "# Folder in which the downloaded ROA pages are stored.\n",
    "ROA_path = f'C:/Users/{os.getlogin()}/Dropbox/C07/3ROAs/'\n",
    "\n",
    "# Case numbers without a saved page in ROA_path.\n",
    "ROA_list = update_file_list(all_case_num, ROA_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2008ee0-b40f-4e99-929c-ae96bf4b2fc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start Chrome through undetected_chromedriver and open the page used for the case searches.\n",
    "roa_search_url = 'https://www.co.genesee.mi.us/roaccsinq/default.aspx'\n",
    "driver = uc.Chrome()\n",
    "driver.get(roa_search_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f627e6a3-1e8f-46b3-8fc8-ecd272e69050",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the result page of every pending case number into ROA_path.\n",
    "download_ROA(driver, file_list=ROA_list, file_path=ROA_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
